Quality Control & EDA

All samples

Sample clustering

sampleDists <- dist(t(assay(rld)))
plot(hclust(sampleDists))

Correlation Heatmap

annot = dplyr::select(experimental_metadata, condition)
row.names(annot) = experimental_metadata$sample_id
rld %>%
  assay() %>%
  cor() %>%
  pheatmap(color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
           annotation = annot,
           annotation_colors = list(condition = c(Control = "#BF3100", OKSM = "#E9B44C", 
                                                  Senolytic = "#1B998B", Senolytic_OKSM = "#5D576B")),
           cluster_rows = TRUE,
           cluster_cols = T,
           cellwidth = 13,
           cellheight = 13)

PCA

ntop = 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
pca = prcomp(t(assay(rld)[select,]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)

pca_data <- plotPCA(rld, intgroup = c("condition", "replicate"), returnData=TRUE)
percentVar <- round(100 * attr(pca_data, "percentVar"), digits=2)
ggplot(pca_data, aes(PC1, PC2, color=condition, shape=replicate)) + geom_point(size=3) +
  scale_x_continuous(paste0("PC1: ",percentVar[1],"% variance")) +
  scale_y_continuous(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_classic() + geom_text(data = pca_data, aes(PC1,PC2, label = name), hjust = 1.2)

Removing TPM Outliers

effective_lengths = matrix(0, ncol=length(experimental_metadata$sample_id), nrow=17714)
colnames(effective_lengths)= experimental_metadata$sample_id
for( i in experimental_metadata$sample_id){
  effective_lengths[,i] = read.table(paste("data/isc/", i, ".genes.results",sep=""), sep="\t", header=TRUE)$effective_length
}
row.names(effective_lengths) = read.table(paste("data/isc/", i, ".genes.results",sep=""), sep="\t", header=TRUE)$gene_id

effective_lengths = rowMeans(effective_lengths[row.names(counts(dds)),])
ncrpk = counts(dds) / (effective_lengths / 1000)
ncrpk = apply(ncrpk, c(1,2), function(x){if(is.nan(x)){0}else{x}})
ncrpk = apply(ncrpk, c(1,2), function(x){if(is.infinite(x)){0}else{x}})
ncscalingfactor = colSums(ncrpk) / 1e6
nctpm = sweep(ncrpk, 2, ncscalingfactor, "/")

nctpm.melt = melt(nctpm)
ggplot(nctpm.melt, aes(x=Var2, y=value)) + geom_boxplot() + theme_classic() + theme(axis.text.x = element_text(angle = 90, colour="black", hjust = 1)) + scale_x_discrete("Sample") + scale_y_continuous("TPM")

tpm.threshold = 20000
test.tpm = apply(nctpm, 1, function(x){ any(x> tpm.threshold) })
ensembl.genes[names(test.tpm[test.tpm])] %>%
  as.data.frame() %>%
  datatable(options = list(scrollX = TRUE))

Differential Expression Analysis

Wald Tests

OKSM vs Control(TdTom)

generate_de_section(dds_wald, "OKSM", "Control")
## [1] "Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)): 1258"

Senolytic vs Control(TdTom)

generate_de_section(dds_wald, "Senolytic", "Control")
## [1] "Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)): 1399"

Senolytic OKSM vs Control(TdTom)

generate_de_section(dds_wald, "Senolytic_OKSM", "Control")
## [1] "Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)): 1311"

OKSM vs Senolytic

generate_de_section(dds_wald, "OKSM", "Senolytic")
## [1] "Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)): 124"

OKSM vs Senolytic_OKSM

generate_de_section(dds_wald, "OKSM", "Senolytic_OKSM")
## [1] "Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)): 109"

Senolytic vs Senolytic_OKSM

generate_de_section(dds_wald, "Senolytic", "Senolytic_OKSM")
## [1] "Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)): 0"

Likelihood Ratio Test

dds_LRT = nbinomLRT(dds, reduced = ~1)
res = results(dds_LRT)

res$gene_biotype= ensembl.genes$gene_biotype[match(row.names(res), ensembl.genes$gene_id)]
res$external_gene_name= ensembl.genes$external_gene_name[match(row.names(res), ensembl.genes$gene_id)]

hist(res$pvalue)

Number of significant genes (padj < 0.1):

sum(res$padj < 0.1, na.rm=T)
## [1] 2967

Visualisation

# assay(x) to access the count data
sig_mat_rld = assay(significant_rld)

# The apply function swaps the rows to samples and columns to genes -- the standard is the other way around: samples in cols and genes in rows, hence the transpose function
zscores = t(apply(sig_mat_rld, 1, function(x){ (x - mean(x)) / sd(x) }))

Elbow Plot

foo = as(zscores, "matrix")
bar = sapply(1:10, function(x){kmeans(foo, centers=x)$tot.withinss})
plot(bar, type="l")

Heatmap

pam_clust <- generate_data(zscores, 7, "pam")
saveRDS(pam_clust, "output/isc/pam_clust.rds")
# pam_clust <- as.data.frame(pam_clust)
# pam_clust$Cluster <- factor(pam_clust$Cluster, levels = c(5,4,3,1,2,6))
# pam_clust <- pam_clust[order(pam_clust$Cluster),]

pheatmap(pam_clust[,1:(ncol(pam_clust)-1)],
         color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
         fontsize_row = 5.5,
         annotation_col = annotation,
         annotation_colors = anno_colours,
         cluster_rows = FALSE,
         cluster_cols = FALSE)

Table of Genes

pam_clust_df <- pam_clust %>%
  as.data.frame() %>%
  mutate(gene_name = ensembl.genes[rownames(.),]$external_gene_name) %>%
  dplyr::select(gene_name, Cluster) %>%
  dplyr::rename("Gene Name" = gene_name)
datatable(pam_clust_df, options = list(scrollX = TRUE), class = "white-space: nowrap")

Number of Genes

c1 <- pam_clust_df[pam_clust_df$Cluster == 1, ] %>%
  dplyr::select(-Cluster)

c2 <- pam_clust_df[pam_clust_df$Cluster == 2, ] %>%
  dplyr::select(-Cluster)

c3 <- pam_clust_df[pam_clust_df$Cluster == 3, ] %>%
  dplyr::select(-Cluster)

c4 <- pam_clust_df[pam_clust_df$Cluster == 4, ] %>%
  dplyr::select(-Cluster)

c5 <- pam_clust_df[pam_clust_df$Cluster == 5, ] %>%
  dplyr::select(-Cluster)

c6 <- pam_clust_df[pam_clust_df$Cluster == 6, ] %>%
  dplyr::select(-Cluster)

c7 <- pam_clust_df[pam_clust_df$Cluster == 7, ] %>%
  dplyr::select(-Cluster)

data.frame(Cluster = c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", 
                       "Cluster 5", "Cluster 6", "Cluster 7", "Total"),
                        Number_of_genes = c(nrow(c1), nrow(c2), nrow(c3), nrow(c4),
                                            nrow(c5), nrow(c6), nrow(c7),
                                            sum(c(nrow(c1),nrow(c2),nrow(c3),nrow(c4),
                                                  nrow(c5),nrow(c6),nrow(c7)))))
##     Cluster Number_of_genes
## 1 Cluster 1             773
## 2 Cluster 2             397
## 3 Cluster 3             128
## 4 Cluster 4             566
## 5 Cluster 5             613
## 6 Cluster 6             248
## 7 Cluster 7             242
## 8     Total            2967

Silhouette Plot

## Checking the stability of clusters
#fviz_nbclust(zscores, kmeans, method = "silhouette")
set.seed(2)
dd = as.dist((1 - cor(t(zscores)))/2)
pam = pam(dd, 7, diss = TRUE)
sil = silhouette(pam$clustering, dd)
plot(sil, border=NA, main = "Silhouette Plot for 7 Clusters")

Functional Enrichments

#listEnrichrSites()
setEnrichrSite("FlyEnrichr")
dbs <- listEnrichrDbs()
to_check <- c("GO_Biological_Process_2018", "KEGG_2019")
output_dir <- "output/isc/"

Cluster 1

GO_Biological_Process_2018

eresList$GO_Biological_Process_2018 %>%
  plot_enrichr("GO_Biological_Process_2018")

datatable(eresList[[1]], options = list(scrollX = TRUE), class = "white-space: nowrap")
get_important_terms_rds(eresList$GO_Biological_Process_2018, output_dir, "isc_c1_go")
c1_go <- get_important_terms(eresList$GO_Biological_Process_2018)

KEGG_2019

eresList$KEGG_2019 %>%
  plot_enrichr("KEGG_2019") 

datatable(eresList[[2]], options = list(scrollX = TRUE), class = "white-space: nowrap")
get_important_terms_rds(eresList$KEGG_2019, output_dir, "isc_c1_kegg")
c1_kegg <- get_important_terms(eresList$KEGG_2019)

Cluster 2

GO_Biological_Process_2018

eresList$GO_Biological_Process_2018 %>%
  plot_enrichr("GO_Biological_Process_2018")

datatable(eresList[[1]], options = list(scrollX = TRUE), class = "white-space: nowrap")
get_important_terms_rds(eresList$GO_Biological_Process_2018, output_dir, "isc_c2_go")
c2_go <- get_important_terms(eresList$GO_Biological_Process_2018)

KEGG_2019

eresList$KEGG_2019 %>%
  plot_enrichr("KEGG_2019") 

datatable(eresList[[2]], options = list(scrollX = TRUE), class = "white-space: nowrap")
get_important_terms_rds(eresList$KEGG_2019, output_dir, "isc_c2_kegg")
c2_kegg <- get_important_terms(eresList$KEGG_2019)

Cluster 3

GO_Biological_Process_2018

eresList$GO_Biological_Process_2018 %>%
  plot_enrichr("GO_Biological_Process_2018")

datatable(eresList[[1]], options = list(scrollX = TRUE), class = "white-space: nowrap")
get_important_terms_rds(eresList$GO_Biological_Process_2018, output_dir, "isc_c3_go")
c3_go <- get_important_terms(eresList$GO_Biological_Process_2018)

KEGG_2019

eresList$KEGG_2019 %>%
  plot_enrichr("KEGG_2019") 

datatable(eresList[[2]], options = list(scrollX = TRUE), class = "white-space: nowrap")
get_important_terms_rds(eresList$KEGG_2019, output_dir, "isc_c3_kegg")
c3_kegg <- get_important_terms(eresList$KEGG_2019)

Cluster 4

GO_Biological_Process_2018

eresList$GO_Biological_Process_2018 %>%
  plot_enrichr("GO_Biological_Process_2018")

datatable(eresList[[1]], options = list(scrollX = TRUE), class = "white-space: nowrap")
get_important_terms_rds(eresList$GO_Biological_Process_2018, output_dir, "isc_c4_go")
c4_go <- get_important_terms(eresList$GO_Biological_Process_2018)

KEGG_2019

eresList$KEGG_2019 %>%
  plot_enrichr("KEGG_2019") 

datatable(eresList[[2]], options = list(scrollX = TRUE), class = "white-space: nowrap")
get_important_terms_rds(eresList$KEGG_2019, output_dir, "isc_c4_kegg")
c4_kegg <- get_important_terms(eresList$KEGG_2019)

Cluster 5

GO_Biological_Process_2018

eresList$GO_Biological_Process_2018 %>%
  plot_enrichr("GO_Biological_Process_2018")

datatable(eresList[[1]], options = list(scrollX = TRUE), class = "white-space: nowrap")
get_important_terms_rds(eresList$GO_Biological_Process_2018, output_dir, "isc_c5_go")
c5_go <- get_important_terms(eresList$GO_Biological_Process_2018)

KEGG_2019

eresList$KEGG_2019 %>%
  plot_enrichr("KEGG_2019") 

datatable(eresList[[2]], options = list(scrollX = TRUE), class = "white-space: nowrap")
get_important_terms_rds(eresList$KEGG_2019, output_dir, "isc_c5_kegg")
c5_kegg <- get_important_terms(eresList$KEGG_2019)

Cluster 6

GO_Biological_Process_2018

eresList$GO_Biological_Process_2018 %>%
  plot_enrichr("GO_Biological_Process_2018")

datatable(eresList[[1]], options = list(scrollX = TRUE), class = "white-space: nowrap")
get_important_terms_rds(eresList$GO_Biological_Process_2018, output_dir, "isc_c6_go")
c6_go <- get_important_terms(eresList$GO_Biological_Process_2018)

KEGG_2019

eresList$KEGG_2019 %>%
  plot_enrichr("KEGG_2019") 

datatable(eresList[[2]], options = list(scrollX = TRUE), class = "white-space: nowrap")
get_important_terms_rds(eresList$KEGG_2019, output_dir, "isc_c6_kegg")
c6_kegg <- get_important_terms(eresList$KEGG_2019)

Cluster 7

GO_Biological_Process_2018

eresList$GO_Biological_Process_2018 %>%
  plot_enrichr("GO_Biological_Process_2018")

datatable(eresList[[1]], options = list(scrollX = TRUE), class = "white-space: nowrap")
get_important_terms_rds(eresList$GO_Biological_Process_2018, output_dir, "isc_c7_go")
c7_go <- get_important_terms(eresList$GO_Biological_Process_2018)

KEGG_2019

eresList$KEGG_2019 %>%
  plot_enrichr("KEGG_2019") 

datatable(eresList[[2]], options = list(scrollX = TRUE), class = "white-space: nowrap")
get_important_terms_rds(eresList$KEGG_2019, output_dir, "isc_c7_kegg")
c7_kegg <- get_important_terms(eresList$KEGG_2019)

Summary Enrichments

GO

all_go = list(c1 = c1_go, c2=c2_go, c3=c3_go, c4=c4_go, c5=c5_go, c6=c6_go, c7=c7_go)
all_go <- all_go %>%
  bind_rows(.id = "Cluster") %>%
  group_by(Cluster) %>%
  slice_min(order_by = Adjusted.P.value, n = 10)
all_go %>%
  mutate(Term = gsub("\\([^()]*\\)", "", Term),
         Term = str_to_title(Term)) %>%
  mutate(Cluster = factor(Cluster, levels = c(unique(all_go$Cluster)))) %>%
  group_by(Cluster) %>%
  mutate(Term = factor(Term, levels = Term)) %>%
  ggplot(aes(x = Cluster, y = Term)) +
  geom_point(aes(colour = Adjusted.P.value, size = Ratio)) + 
  ylab(NULL) + 
  xlab("Cluster") +
  scale_color_continuous(low="red", high="blue", guide=guide_colorbar(reverse=FALSE)) + 
  scale_y_discrete(labels = function(x) str_wrap(x, width = 60),
                   limits=rev) +
  #labs(fill = "Adjusted p-value") +
  guides(colour = guide_colorbar(title = "Adjusted p-value", reverse = TRUE),
         size = guide_legend(title = "Gene Ratio")) +
  theme(axis.text.x = element_text(size=8)) +
  theme_bw() +
  ggtitle("GO Enrichment Analysis")

KEGG

all_kegg = list(c1 = c1_kegg, c2=c2_kegg, c3=c3_kegg, c4=c4_kegg, c5=c5_kegg, c6=c6_kegg, c7=c7_kegg)
all_kegg <- all_kegg %>%
  bind_rows(.id = "Cluster") %>%
  group_by(Cluster) %>%
  slice_min(order_by = Adjusted.P.value, n = 10)
all_kegg %>%
  mutate(Term = gsub("\\([^()]*\\)", "", Term),
         Term = str_to_title(Term)) %>%
  mutate(Cluster = factor(Cluster, levels = c(unique(all_kegg$Cluster)))) %>%
  group_by(Cluster) %>%
  mutate(Term = factor(Term, levels = Term)) %>%
  ggplot(aes(x = Cluster, y = Term)) +
  geom_point(aes(colour = Adjusted.P.value, size = Ratio)) + 
  ylab(NULL) + 
  xlab("Cluster") +
  scale_color_continuous(low="red", high="blue", guide=guide_colorbar(reverse=FALSE)) + 
  scale_y_discrete(labels = function(x) str_wrap(x, width = 60),
                   limits=rev) +
  #labs(fill = "Adjusted p-value") +
  guides(colour = guide_colorbar(title = "Adjusted p-value", reverse = TRUE),
         size = guide_legend(title = "Gene Ratio")) +
  theme(axis.text.x = element_text(size=8)) +
  theme_bw() +
  ggtitle("GO Enrichment Analysis")

Session Info

sessionInfo()
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] enrichR_3.0                           DT_0.17                              
##  [3] RColorBrewer_1.1-2                    scales_1.1.1                         
##  [5] reshape2_1.4.4                        knitr_1.33                           
##  [7] biomaRt_2.46.3                        GenomicFeatures_1.42.3               
##  [9] AnnotationDbi_1.52.0                  genefilter_1.72.1                    
## [11] DESeq2_1.30.1                         SummarizedExperiment_1.20.0          
## [13] Biobase_2.50.0                        MatrixGenerics_1.2.1                 
## [15] matrixStats_0.58.0                    BSgenome.Dmelanogaster.UCSC.dm6_1.4.1
## [17] BSgenome_1.58.0                       rtracklayer_1.50.0                   
## [19] GenomicRanges_1.42.0                  GenomeInfoDb_1.26.7                  
## [21] Biostrings_2.58.0                     XVector_0.30.0                       
## [23] IRanges_2.24.1                        S4Vectors_0.28.1                     
## [25] BiocGenerics_0.36.0                   forcats_0.5.1                        
## [27] stringr_1.4.0                         dplyr_1.0.5                          
## [29] purrr_0.3.4                           readr_1.4.0                          
## [31] tidyr_1.1.3                           tibble_3.1.0                         
## [33] tidyverse_1.3.0                       EnhancedVolcano_1.8.0                
## [35] ggrepel_0.9.1                         ggplot2_3.3.3                        
## [37] pheatmap_1.0.12                       cluster_2.1.1                        
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1             backports_1.2.1          BiocFileCache_1.14.0    
##   [4] plyr_1.8.6               splines_4.0.5            crosstalk_1.1.1         
##   [7] BiocParallel_1.24.1      digest_0.6.27            htmltools_0.5.1.1       
##  [10] fansi_0.4.2              magrittr_2.0.1           memoise_2.0.0           
##  [13] openxlsx_4.2.3           annotate_1.68.0          modelr_0.1.8            
##  [16] extrafont_0.17           extrafontdb_1.0          askpass_1.1             
##  [19] prettyunits_1.1.1        colorspace_2.0-0         blob_1.2.1              
##  [22] rvest_1.0.0              rappdirs_0.3.3           haven_2.3.1             
##  [25] xfun_0.22                crayon_1.4.1             RCurl_1.98-1.3          
##  [28] jsonlite_1.7.2           survival_3.2-10          glue_1.4.2              
##  [31] gtable_0.3.0             zlibbioc_1.36.0          DelayedArray_0.16.3     
##  [34] proj4_1.0-10.1           car_3.0-10               Rttf2pt1_1.3.8          
##  [37] maps_3.3.0               abind_1.4-5              DBI_1.1.1               
##  [40] rstatix_0.7.0            Rcpp_1.0.6               xtable_1.8-4            
##  [43] progress_1.2.2           foreign_0.8-81           bit_4.0.4               
##  [46] htmlwidgets_1.5.3        httr_1.4.2               ellipsis_0.3.1          
##  [49] farver_2.1.0             pkgconfig_2.0.3          XML_3.99-0.6            
##  [52] sass_0.3.1               dbplyr_2.1.1             locfit_1.5-9.4          
##  [55] utf8_1.2.1               labeling_0.4.2           tidyselect_1.1.0        
##  [58] rlang_0.4.10             munsell_0.5.0            cellranger_1.1.0        
##  [61] tools_4.0.5              cachem_1.0.4             cli_2.4.0               
##  [64] generics_0.1.0           RSQLite_2.2.6            broom_0.7.6             
##  [67] evaluate_0.14            fastmap_1.1.0            yaml_2.2.1              
##  [70] bit64_4.0.5              fs_1.5.0                 zip_2.1.1               
##  [73] ash_1.0-15               ggrastr_0.2.3            xml2_1.3.2              
##  [76] compiler_4.0.5           rstudioapi_0.13          beeswarm_0.3.1          
##  [79] curl_4.3                 ggsignif_0.6.1           reprex_2.0.0            
##  [82] geneplotter_1.68.0       bslib_0.2.4              stringi_1.5.3           
##  [85] highr_0.8                ggalt_0.4.0              lattice_0.20-41         
##  [88] Matrix_1.3-2             vctrs_0.3.7              pillar_1.6.0            
##  [91] lifecycle_1.0.0          jquerylib_0.1.3          data.table_1.14.0       
##  [94] bitops_1.0-6             R6_2.5.0                 rio_0.5.26              
##  [97] KernSmooth_2.23-18       vipor_0.4.5              MASS_7.3-53.1           
## [100] assertthat_0.2.1         rjson_0.2.20             openssl_1.4.3           
## [103] rprojroot_2.0.2          withr_2.4.1              GenomicAlignments_1.26.0
## [106] Rsamtools_2.6.0          GenomeInfoDbData_1.2.4   hms_1.0.0               
## [109] grid_4.0.5               rmarkdown_2.7            carData_3.0-4           
## [112] ggpubr_0.4.0             lubridate_1.7.10         ggbeeswarm_0.6.0